In [3]:
%pylab inline
import re
import math
import string
from collections import Counter
from __future__ import division
In [6]:
TEXT = open('big.txt').read()
len(TEXT)
Out[6]:
In [8]:
# Let's break the text into tokens.
def tokens(text):
    "List all the word tokens in a text. Normalize to lowercase."
    return re.findall('[a-z]+', text.lower())
In [9]:
tokens('this is a test, 1, 2 3, > This is')
Out[9]:
In [12]:
WORDS = tokens(TEXT)
len(WORDS)
Out[12]:
In [14]:
# Print first 10 words:
print WORDS[:10]
In [15]:
# Bag of Words model
def sample(bag, n=10):
    "Sample a random n-word sentence from the model described by bag of words."
    return ' '.join(random.choice(bag) for _ in range(n))
In [17]:
sample(WORDS)
Out[17]:
In [20]:
# Another representation of a bag of words is a Counter, which is a dictionary subclass.
Counter(tokens('Is this a test? It is a test!'))
Out[20]:
In [22]:
COUNTS = Counter(WORDS)
print COUNTS.most_common(10)
In [23]:
for w in tokens('the rare and neverbeforeseen words'):
    print COUNTS[w], w
In [29]:
# Zipf's Law: the nth most frequent word appears with a frequency of about 1/n that of the most frequent word.
M = COUNTS['the']
#print M
yscale('log'); xscale('log'); title("Frequency of the nth most frequent word, with a 1/n line.")
plot([c for (w, c) in COUNTS.most_common()])
plot([M/i for i in range(1, len(COUNTS)+1)])
Out[29]:
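In [ ]:
# A quick numeric check of Zipf's prediction (an illustrative cell, not part of
# the original notebook): for a few ranks n, compare the observed count of the
# nth most frequent word against the predicted M/n.
for n, (w, c) in enumerate(COUNTS.most_common(10), 1):
    print n, w, c, int(M/n)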
In [31]:
def correct(word):
    "Find the best spelling correction for this word."
    # Prefer edit distance 0, then 1, then 2; otherwise default to the word itself.
    candidates = (known(edits0(word)) or
                  known(edits1(word)) or
                  known(edits2(word)) or
                  [word])
    return max(candidates, key=COUNTS.get)
In [32]:
def known(words):
    "Return the subset of words that are actually in the dictionary."
    return {w for w in words if w in COUNTS}

def edits0(word):
    "Return all strings that are zero edits away from word (i.e. just word itself)."
    return {word}

def edits2(word):
    "Return all strings that are two edits away from this word."
    return {e2 for e1 in edits1(word) for e2 in edits1(e1)}
In [57]:
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
    "Return all strings that are one edit away from this word."
    pairs = splits(word)
    deletes = [a+b[1:] for (a, b) in pairs if b]
    transposes = [a+b[1]+b[0]+b[2:] for (a, b) in pairs if len(b) > 1]
    replaces = [a+c+b[1:] for (a, b) in pairs if b for c in alphabet]
    inserts = [a+c+b for (a, b) in pairs for c in alphabet]
    return set(deletes + transposes + replaces + inserts)

def splits(word):
    "Return a list of all possible (first, rest) pairs that comprise word."
    return [(word[:i], word[i:]) for i in range(len(word)+1)]
In [63]:
print edits0('wird')
In [64]:
print edits1('wird')
In [70]:
#print edits2('wird')
print len(edits1('wird'))
print len(edits2('wird'))
In [71]:
map(correct, tokens('Speling errurs in somethink. Whutever; unusuel misteakes everyware?'))
Out[71]:
In [76]:
def correct_text(text):
    "Correct all the words within a text, returning the corrected text."
    return re.sub('[a-zA-Z]+', correct_match, text)

def correct_match(match):
    "Spell-correct the word in match, preserving its case: upper/lower/title."
    word = match.group()
    return case_of(word)(correct(word.lower()))

def case_of(text):
    "Return the case-function appropriate for text: upper, lower, title, or just str."
    return (str.upper if text.isupper() else
            str.lower if text.islower() else
            str.title if text.istitle() else
            str)
In [77]:
map(case_of, ['UPPER', 'lower', 'Title', 'CamelCase'])
Out[77]:
In [79]:
correct_text('Spelling errurs IN Somethink.')
Out[79]:
In [80]:
# From counts to probabilities
In [102]:
def pdist(counter):
    "Make a probability distribution, given evidence from a Counter."
    N = sum(counter.values())
    return lambda x: counter[x]/N

Pword = pdist(COUNTS)
In [103]:
for w in tokens("the is most common word"):
print Pword(w), w
In [104]:
# Memoization decorator
def memo(f):
    "Memoize function f, caching its results keyed by the argument tuple."
    cache = {}
    def fmemo(*args):
        if args not in cache:
            cache[args] = f(*args)
        return cache[args]
    fmemo.cache = cache
    return fmemo
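In [ ]:
# A small illustration of memo (a hypothetical cell, not in the original):
# the wrapped function runs once per distinct argument, then hits the cache.
calls = [0]

@memo
def slow_double(x):
    calls[0] += 1
    return 2 * x

print slow_double(21), slow_double(21)  # same answer both times
print calls[0]  # 1: the second call was served from the cache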
In [105]:
max(len(w) for w in COUNTS)
Out[105]:
In [106]:
def splits(text, start=0, L=20):
    "Return a list of all (first, rest) pairs; start <= len(first) <= L."
    return [(text[:i], text[i:]) for i in range(start, min(len(text), L)+1)]
In [107]:
print splits('word')
In [123]:
def Pwords(words):
    "Probability of words, assuming each word is independent of others."
    return product(Pword(w) for w in words)

def product(nums):
    "Multiply the numbers together. (Like `sum`, but with multiplication.)"
    result = 1
    for x in nums:
        result *= x
    return result
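In [ ]:
# An illustrative check (hypothetical cell, not in the original): a sequence of
# common words gets a non-zero probability, while one containing an
# out-of-vocabulary word scores zero under this independence model.
print Pwords(tokens('this is a test'))
print Pwords(tokens('this is a neverbeforeseen test'))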
In [124]:
@memo
def segment(text):
    "Return a list of words that is the most probable segmentation of text."
    if not text:
        return []
    else:
        candidates = ([first] + segment(rest) for (first, rest) in splits(text, 1))
        return max(candidates, key=Pwords)
In [125]:
segment('choosespain')
Out[125]:
In [126]:
decl = ('wheninthecourseofhumaneventsitbecomesnecessaryforonepeople' +
'todissolvethepoliticalbandswhichhaveconnectedthemwithanother' +
'andtoassumeamongthepowersoftheearththeseparateandequalstation' +
'towhichthelawsofnatureandofnaturesgodentitlethem')
In [127]:
print(segment(decl))
In [144]:
# Big data for n-grams
def load_counts(filename, sep='\t'):
    "Return a Counter initialized from key-value pairs, one per line of filename."
    C = Counter()
    for line in open(filename):
        line = line.strip()
        key, count = line.split(sep)
        C[key] = int(count)
    return C
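In [ ]:
# A quick sanity check on a tiny synthetic file (a hypothetical cell;
# 'tiny_counts.txt' and its numbers are made up here, whereas count_1w.txt and
# count_2w.txt below come from real n-gram count data).
with open('tiny_counts.txt', 'w') as f:
    f.write('the\t100\nof\t60\nand\t55\n')
print load_counts('tiny_counts.txt').most_common(3)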
In [147]:
COUNTS1 = load_counts('count_1w.txt')
COUNTS2 = load_counts('count_2w.txt')
#print COUNTS2.most_common(30)
#print COUNTS1.most_common(30)
P1w = pdist(COUNTS1)
P2w = pdist(COUNTS2)
In [149]:
P1w('hello')
Out[149]:
In [159]:
def pWords(words):
"Probability of word sequence assuming each word is independent of other words"
return product(P1w(w) for w in words)
def cPword(word,prev):
"conditional prob of word given prev word. "
bigram= prev + ' ' + word
if P2w(bigram) > 0 and P1w(prev) > 0:
return P2w(bigram)/P1w(prev)
else:
# average the back-ff value and zero
return P1w(word)/2
def Pwords2(words,prev='<S>'):
return product(cPword(w,(prev if i==0 else words[i-1])) for (i,w) in enumerate(words) )
In [160]:
print Pwords2(tokens('this is a test'))
In [170]:
# lets create segment2 function which selects best function on candidate.
@memo
def segment2(text, prev="<S>"):
if not text:
return []
else :
candidates=([first] + segment2(rest,first) for (first,rest) in splits(text,1))
return max(candidates , key= lambda words: Pwords2(words,prev))
In [173]:
print segment2('choosespain')
print segment2('speedofart')
In [176]:
# Simulate throws of a standard six-sided die (assumed here) and stop once
# every face has been seen: a coupon-collector style experiment.
import random
n = 100
seen = set()
for i in range(n):
    throw = random.randint(1, 6)
    seen.add(throw)
    if len(seen) == 6:
        print 'all six faces seen after', i+1, 'throws'
        break